library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Library to import the data
library(readr)
# This dataset on Video games sales is downloaded from Kaggle.
# Dataset link - https://www.kaggle.com/datasets/rush4ratio/video-game-sales-with-ratings
df <- read_csv("Video_Games_Sales_as_at_22_Dec_2016.csv")
This dataset is prepared by web scraping the website ‘Metacritic’. This dataset is actually a combination of two datasets, one had the sales related information of the games and the second dataset has included game ratings by users and critics.
These are the variables included:
Name: Title of the video game
Platform: Video gaming console on which the game is released.
Year_of_Release: The year when the video game was released.
Genre: The genre of the game.
Publisher: The publisher who published the game.
Developer: The Developer who developed the game.
NA_Sales: Sales in North America.
EU_Sales: Sales in Europe.
JP_Sales: Sales in Japan.
Other_Sales: Sales in other countries.
Global_Sales: Overall sales of the title.
Critic_Score: Aggregate score compiled by Metacritic staff.
Critic_Count: The number or critics used in coming with the critic score.
User_Score: Score by Metacritics subscribers.
Rating: The ESRB rating of the game.
User Count: Number of users who gave the user_score.
# summary of the dataset
summary(df)
## Name Platform Year_of_Release Genre
## Length:16719 Length:16719 Length:16719 Length:16719
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Publisher NA_Sales EU_Sales JP_Sales
## Length:16719 Min. : 0.0000 Min. : 0.000 Min. : 0.0000
## Class :character 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.0000
## Mode :character Median : 0.0800 Median : 0.020 Median : 0.0000
## Mean : 0.2633 Mean : 0.145 Mean : 0.0776
## 3rd Qu.: 0.2400 3rd Qu.: 0.110 3rd Qu.: 0.0400
## Max. :41.3600 Max. :28.960 Max. :10.2200
##
## Other_Sales Global_Sales Critic_Score Critic_Count
## Min. : 0.00000 Min. : 0.0100 Min. :13.00 Min. : 3.00
## 1st Qu.: 0.00000 1st Qu.: 0.0600 1st Qu.:60.00 1st Qu.: 12.00
## Median : 0.01000 Median : 0.1700 Median :71.00 Median : 21.00
## Mean : 0.04733 Mean : 0.5335 Mean :68.97 Mean : 26.36
## 3rd Qu.: 0.03000 3rd Qu.: 0.4700 3rd Qu.:79.00 3rd Qu.: 36.00
## Max. :10.57000 Max. :82.5300 Max. :98.00 Max. :113.00
## NA's :8582 NA's :8582
## User_Score User_Count Developer Rating
## Length:16719 Min. : 4.0 Length:16719 Length:16719
## Class :character 1st Qu.: 10.0 Class :character Class :character
## Mode :character Median : 24.0 Mode :character Mode :character
## Mean : 162.2
## 3rd Qu.: 81.0
## Max. :10665.0
## NA's :9129
Data pre processing
df_cleaned <- na.omit(df)
df_cleaned
# Get the class of each column
sapply(df_cleaned, class)
## Name Platform Year_of_Release Genre Publisher
## "character" "character" "character" "character" "character"
## NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales
## "numeric" "numeric" "numeric" "numeric" "numeric"
## Critic_Score Critic_Count User_Score User_Count Developer
## "numeric" "numeric" "character" "numeric" "character"
## Rating
## "character"
unique(df_cleaned$Year_of_Release)
## [1] "2006" "2008" "2009" "2005" "2007" "2010" "2013" "2004" "2002" "2001"
## [11] "2011" "2012" "2014" "1997" "1999" "2015" "2016" "2003" "1998" "1996"
## [21] "2000" "N/A" "1994" "1985" "1992" "1988"
sum(df_cleaned$Name=="N/A")
## [1] 0
sum(df_cleaned$Platform=="N/A")
## [1] 0
sum(df_cleaned$Genre=="N/A")
## [1] 0
sum(df_cleaned$Publisher=="N/A")
## [1] 4
sum(df_cleaned$User_Score=="N/A")
## [1] 0
sum(df_cleaned$Developer=="N/A")
## [1] 0
sum(df_cleaned$Rating=="N/A")
## [1] 0
sum(df_cleaned$Year_of_Release=="N/A")
## [1] 121
df_cleaned <- df_cleaned[df_cleaned$Year_of_Release != "N/A",]
df_cleaned <- df_cleaned[df_cleaned$Publisher != "N/A",]
df_cleaned$Year_of_Release <- as.integer(df_cleaned$Year_of_Release)
df_cleaned$User_Score <- as.integer(df_cleaned$User_Score)
summary(df_cleaned$Critic_Score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 62.00 72.00 70.27 80.00 98.00
summary(df_cleaned$User_Score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 6.000 7.000 6.737 8.000 9.000
# So we are converting the critic score range from 0 to 10.
df_cleaned$Critic_Score <- df_cleaned$Critic_Score / 10
summary(df_cleaned$Critic_Score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.300 6.200 7.200 7.027 8.000 9.800
summary(df_cleaned$User_Score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 6.000 7.000 6.737 8.000 9.000
duplicates <- df_cleaned[duplicated(df_cleaned), ]
# Calculate the count of duplicates
count_of_duplicates <- nrow(duplicates)
count_of_duplicates
## [1] 0
counts <- table(df_cleaned$Rating)
counts
##
## AO E E10+ K-A M RP T
## 1 2082 930 1 1433 1 2377
df_filtered <- df_cleaned[df_cleaned$Rating != "AO",]
df_filtered <- df_filtered[df_filtered$Rating != "RP",]
df_filtered <- df_filtered[df_filtered$Rating != "K-A",]
counts <- table(df_filtered$Rating)
counts
##
## E E10+ M T
## 2082 930 1433 2377
Problem 2 Visualize something interesting to you from the dataset using ggplot2. Comment on what the visualization shows and any key conclusions.
# Importing the library ggplot2 for plotting
library(ggplot2)
ggplot(data=df_filtered)+
geom_bar(aes(x=Platform,color=Platform,fill=Platform)) +
theme(legend.position="bottom") +
labs(title = "PS2 has the highest number of video games made",
x = "Platform",
y = "Total number of games")
Observation:
ggplot(data=df_filtered)+
geom_bar(aes(x=Year_of_Release)) +
theme(legend.position="bottom") +
scale_x_continuous(limits = c(1980, 2020),
breaks = seq(1980, 2020, by = 5)) +
labs(title = "Year 2008 and 2007 has the highest number of video games released.",
x = "Year_of_Release",
y = "Total number of games")
Observations:
Year 2007-2008 had the highest number of games released, with a combined total of around 1180 games.
We can see a sudden jump in the number of games released after 2000, the trend goes negative after 2008.
ggplot(data=df_filtered,fig.align="center",fig.width = 50)+
geom_bar(aes(x=Genre)) +
theme(legend.position="bottom") +
theme(axis.text.x = element_text(size = 8, angle = 90, face = "bold")) +
labs(title = "Action genre is the most famous genre for video games",
x = "Video Game Genre",
y = "Total number of games")
Observation:
Genre Action has the highest number of games released, followed by genre Sports.
Genre Puzzle has the least number of games released.
ggplot(data=df_filtered,fig.align="center",fig.width = 50,aes(x=Genre))+
geom_bar(data = subset(df_filtered, Genre == 'Action')) + facet_wrap(~Year_of_Release) +
theme(legend.position="bottom") +
labs(title = "Year 2009 has the highest number of Action games released.",
x = "Action genre for each year",
y = "Total number of games")
Observation:
xy <- sort(table(df_filtered$Publisher),decreasing = TRUE)
publisher_counts <- sort(table(df_filtered$Publisher), decreasing = TRUE)
top_publishers <- names(publisher_counts)[1:15]
df_filtered2 <- df_filtered[df_filtered$Publisher %in% top_publishers, ]
top_publishers
## [1] "Electronic Arts" "Ubisoft"
## [3] "Activision" "Sony Computer Entertainment"
## [5] "THQ" "Nintendo"
## [7] "Sega" "Take-Two Interactive"
## [9] "Konami Digital Entertainment" "Namco Bandai Games"
## [11] "Capcom" "Atari"
## [13] "Microsoft Game Studios" "Tecmo Koei"
## [15] "Square Enix"
Plotting a bar graph to visualize the data for top 15 publishers.
ggplot(data=df_filtered2,aes(x=Publisher))+
geom_bar() +
theme(legend.position="bottom") +
theme(axis.text.x = element_text(size = 8, angle = 90, face = "bold")) +
labs(title = "Electronic Arts is the most popular publisher of video games.",
x = "Publishers",
y = "Total number of games")
Observations:
df_ps2 <- df_filtered2[df_filtered2$Platform == "PS2",]
So now we have all the data with platform as PS2 and for the top 15 publishers.
ggplot(data=df_ps2,fig.align="center",fig.width = 50,aes(x=Publisher))+
geom_bar(aes(fill=Publisher)) + facet_wrap(~Platform) +
theme(axis.text.x = element_text(size = 8, angle = 90, face = "bold")) +
labs(title = "Electronic arts has the highest number of games published for PS2",
x = "Publishers",
y = "Total number of games") + theme(legend.key.size = unit(0.2, "cm"))
Observations:
Electronic arts is still the most famous video game publisher for the platform ‘PS2’.
Sony Computer Entertainment stands second which was expected as the platform ‘PS2’ is developed by Sony.
ggplot(data=df_filtered,aes(x=Critic_Score,y=Global_Sales))+ geom_smooth() +
theme(legend.position="bottom") +
scale_x_continuous(limits = c(0,10),
breaks = seq(0,10, by = 0.5)) +
theme(axis.text.x = element_text(size = 8, face = "bold")) +
labs(title = "Global Sales increases sharply after the critic score of 8.5",
x = "Critic Score",
y = "Global Sales in millions")
Observations:
Global Sales and Critic Score has a positive relationship, with sales increasing as the critic score increases.
There is a sharp rise in the global sales of the video games after the critic score of 8.5.
The least critic score is somewhere between 1.25 - 1.3 and the highest critic score is in the range 9.75 - 9.8 .
ggplot(data=df_filtered,aes(x=User_Score,y= Global_Sales))+
geom_smooth() +geom_jitter() + scale_y_log10() +
scale_x_continuous(limits = c(0,10),
breaks = seq(0,10, by = 0.5)) +
labs(title = "Global Sales increases as the User Score increases",
x = "User Score",
y = "Global Sales (log10 millions) ")
Observation:
Here also we see a general trend of global sales increasing as the user score increases.
There are very few games with a critic score of more than 8.5, whereas there are no games with user score more than 9.5. This tells that users are more strict in giving scores in compare to the critics.
The range 6.5 - 8.5, is the most dense.
df_filtered %>%
ggplot(aes(x=Genre, y= Global_Sales)) +
geom_boxplot() + scale_y_log10() +
stat_summary(fun = median, geom = "text",
aes(label = round(..y.., 2)),
vjust = -0.5, size = 3)+
theme(plot.title = element_text(hjust=0.5)) +
theme(axis.text.x = element_text(size = 10, angle = 90,face = "bold")) +
labs(title = "Genre Sports has the highest number of global sales.",
x = "Genre",
y = "Global Sales (log10 millions) ")
Observation:
Here we can see that genre like puzzle, strategy, and adventure have really low count of sales in compare to other genre.
Sports genre has the highest number of sales, with the median value around 0.39 in log10 millions.
ggplot(data=df_filtered,fig.align="center",fig.width = 50)+
geom_line(aes(x=Year_of_Release, y= NA_Sales)) +
scale_x_continuous(limits = c(1980, 2020),
breaks = seq(1980, 2020, by = 2)) +
theme(legend.position="bottom") +
theme(axis.text.x = element_text(size = 8, angle = 90, face = "bold")) +
labs(title = "Total sales in North America based on year of release",
x = " Year of Release",
y = "Sales in North America (millions)")
ggplot(data=df_filtered,fig.align="center",fig.width = 50)+
geom_line(aes(x=Year_of_Release, y= EU_Sales)) +
theme(legend.position="bottom") +
scale_x_continuous(limits = c(1980, 2020),
breaks = seq(1980, 2020, by = 2)) +
theme(axis.text.x = element_text(size = 8, angle = 90, face = "bold"))+
labs(title = "Total sales in Europe based on year of release",
x = " Year of Release",
y = "Sales in Europe (millions)")
ggplot(data=df_filtered,fig.align="center",fig.width = 50)+
geom_line(aes(x=Year_of_Release, y= JP_Sales)) +
theme(legend.position="bottom") +
scale_x_continuous(limits = c(1980, 2020),
breaks = seq(1980, 2020, by = 2)) +
theme(axis.text.x = element_text(size = 8, angle = 90, face = "bold")) +
labs(title = "Total sales in Japan based on year of release",
x = " Year of Release",
y = "Sales in Japan (millions)")
ggplot(data=df_filtered,fig.align="center",fig.width = 50)+
geom_line(aes(x=Year_of_Release, y= Other_Sales)) +
theme(legend.position="bottom") +
scale_x_continuous(limits = c(1980, 2020),
breaks = seq(1980, 2020, by = 2)) +
theme(axis.text.x = element_text(size = 8, angle = 90, face = "bold")) +
labs(title = "Total sales in other regions based on year of release",
x = " Year of Release",
y = "Sales in all other regions (millions)")
ggplot(data=df_filtered,fig.align="center",fig.width = 50)+
geom_line(aes(x=Year_of_Release, y= Global_Sales)) +
theme(legend.position="bottom") +
scale_x_continuous(limits = c(1980, 2020),
breaks = seq(1980, 2020, by = 2)) +
theme(axis.text.x = element_text(size = 8, angle = 90, face = "bold")) +
labs(title = "Global sales based on year of release",
x = " Year of Release",
y = "Global Sales (millions)")
Observation:
Sales in North America, Europe and globally combined increased rapidly in the year 2005, which followed a sharp fall in the same year as well.
In Japan, there was a continue increase and decrease in the video games sales,with the highest games sold in the year 2005 - 2006.
Total Sales in other regions were highest in the year 2003 - 2004.
# we want to see how top 5 genres of global sales did in japan in the year 1996-2015
# so first have a subset of data of the games sold between the year 1995-2015
df_tp <- df_filtered[df_filtered$Year_of_Release> 1995,]
df_tp2 <- df_tp[df_tp$Year_of_Release<2016,]
# df_tp2 consists of data between the year 1995-2015
xy <- sort(table(df_tp2$Genre),decreasing = TRUE)
genre_counts <- sort(table(df_tp2$Genre), decreasing = TRUE)
top_genre <- names(genre_counts)[1:5]
df_tp3 <- df_tp2[df_tp2$Genre %in% top_genre, ]
# Now that we have top 5 genres played in the year 1995-2015, we can plot a graph to
# see how each of these genre did in Japan
ggplot(data=df_tp3,fig.align="center",fig.width = 50)+
geom_line(aes(x=Year_of_Release, y= JP_Sales)) + facet_wrap(~Genre,nrow=3) +
theme(legend.position="bottom") +
theme(axis.text.x = element_text(size = 8, angle = 90, face = "bold")) +
labs(title = "Total sales in Japan based on year of release",
x = " Year of Release",
y = "Sales in Japan (millions)")
Observations:
Genre Role-Playing has the highest number of sales in Japan, with the highest number of games sold in the year 2008-2009.
Genre Shooting has the lowest number of sales in Japan, with the highest sold in the year 2015.
xy <- sort(table(df_filtered$Developer),decreasing = TRUE)
developer_counts <- sort(table(df_filtered$Developer), decreasing = TRUE)
top_developers <- names(developer_counts)[1:15]
df_filtered3 <- df_filtered[df_filtered$Developer %in% top_developers, ]
ggplot(data=df_filtered3,aes(x=Developer))+
geom_bar(aes(fill=Developer)) +
theme(legend.position="bottom") +
theme(axis.text.x = element_text(size = 8, angle = 90, face = "bold")) +
labs(title = "EA Canada is the most popular developer of video games.",
x = "Developers",
y = "Total number of games")
Observations:
dfEACanada <- df_filtered[df_filtered$Developer == "EA Canada",]
ggplot(data=dfEACanada,aes(x=Genre))+
geom_bar(aes(fill=Genre)) +
theme(legend.position="bottom") +
theme(axis.text.x = element_text(size = 8, angle = 90, face = "bold")) +
labs(title = "EA Canada has developed highest number of games in the genre Sports",
x = "Genre",
y = "Total number of games")
Observation:
plot <- ggplot(data=df_filtered,aes(x=Rating))+
geom_bar(aes(fill=Rating)) +
theme(legend.position="bottom") +
theme(axis.text.x = element_text(size = 10, face = "bold")) +
labs(title = "ESRB rating 'T' has the highest number of games developed.",
x = "ESRB Rating",
y = "Total number of games") +
png("ESRBrating.png", width = 800, height = 600, res = 120)
print(plot)
dev.off()
## quartz_off_screen
## 2
Observations:
ESRB rating ‘T’ has the highest number of games developed.
ESRB rating ‘E10+’ has the lowest number of games developed.
plot<- ggplot(data=df_tp3,fig.align="center",fig.width = 50)+
geom_line(aes(x=Year_of_Release, y= Global_Sales)) + facet_wrap(~Rating) +
theme(legend.position="bottom") +
theme(axis.text.x = element_text(size = 8, angle = 90, face = "bold")) +
labs(title = "ESRB Rating 'E' has the highest global sales.",
x = " Year of Release",
y = "Global Sales (millions)")
png("ESRBratingSales.png", width = 800, height = 600, res = 120)
print(plot)
dev.off()
## quartz_off_screen
## 2
Observations:
ESRB rating ‘E’ has the highest global sales.
ESRB rating ‘T’ is the rating with the highest number of games developed, but they have very less sale.